DEXSeq Isoform Analysis

Load Libraries

library(DEXSeq)
library(tidyverse)
library(dplyr)
library(msigdbr)
library(fgsea)
source("99_proj_func.R")

Ocular Motor Neurons

isoCm_Ocular <- read.csv("data/OcularIsoformCountMatrix.csv", row.names = 1)
Ocular_Metadata <- read.csv("data/OcularSampleMetadata.csv", row.names = 1)
IsoformInfo <- read.csv("data/isoformInfo.csv")
OcularResult <- dexseq_analysis(isoCm_Ocular, Ocular_Metadata, IsoformInfo)
converting counts to integer mode
Warning in DESeqDataSet(rse, design, ignoreRank = TRUE): some variables in
design formula are characters, converting to factors
Fit for gene/exon ENSG00000234651 threw the next warning(s): the matrix is either rank-deficient or not positive definite

Compare to Differential Expression:

alpha <- 0.05  # threshold

OcularResultGene <- OcularResult |> 
  as.data.frame() |> 
  as_tibble() |>
  filter(!is.na(pvalue)) |> 
  select(
    gene_id = groupID,
    isoform_id = featureID,
    stat,
    log2FC = log2fold_SOD1_C9ORF72,
    pvalue,
    padj
  ) |> 
  group_by(gene_id) |> 
  slice_min(
    order_by = pvalue,
    n = 1,
    with_ties = FALSE
  )  |> 
  ungroup()  |> 
  mutate(
    padj = p.adjust(pvalue, method = 'fdr')
  ) |>
  filter(!is.na(padj))

head(OcularResultGene)
# A tibble: 6 × 6
  gene_id         isoform_id        stat log2FC   pvalue    padj
  <chr>           <chr>            <dbl>  <dbl>    <dbl>   <dbl>
1 ENSG00000000003 ENST00000612152  0.663 -1.38  0.718    0.802  
2 ENSG00000000419 ENST00000371582 15.4   -0.441 0.000459 0.00457
3 ENSG00000000457 ENST00000423670  4.38   0.632 0.112    0.262  
4 ENSG00000000460 ENST00000459772  2.89   0.462 0.236    0.410  
5 ENSG00000000971 ENST00000695981  2.25   2.06  0.324    0.494  
6 ENSG00000001036 ENST00000451668  5.16  -0.963 0.0757   0.205  
dex_gene <- OcularResultGene |> 
  mutate(gene_id,
         dex_padj = padj,
         dex_sig = !is.na(padj) & padj < alpha)
head(dex_gene)
# A tibble: 6 × 8
  gene_id         isoform_id        stat log2FC  pvalue    padj dex_padj dex_sig
  <chr>           <chr>            <dbl>  <dbl>   <dbl>   <dbl>    <dbl> <lgl>  
1 ENSG00000000003 ENST00000612152  0.663 -1.38  7.18e-1 0.802    0.802   FALSE  
2 ENSG00000000419 ENST00000371582 15.4   -0.441 4.59e-4 0.00457  0.00457 TRUE   
3 ENSG00000000457 ENST00000423670  4.38   0.632 1.12e-1 0.262    0.262   FALSE  
4 ENSG00000000460 ENST00000459772  2.89   0.462 2.36e-1 0.410    0.410   FALSE  
5 ENSG00000000971 ENST00000695981  2.25   2.06  3.24e-1 0.494    0.494   FALSE  
6 ENSG00000001036 ENST00000451668  5.16  -0.963 7.57e-2 0.205    0.205   FALSE  
load("data/Ocular_dseqResult.RData")
OcularDesq <- res_ocular
OcularDesq$padj <- as.numeric(OcularDesq$padj)
OcularDesq <- OcularDesq |>
  as.data.frame() |> 
  rownames_to_column("gene_id") |>
  mutate(deseq_sig = !is.na(padj) & padj < alpha) |>
  filter(!is.na(padj))
head(OcularDesq)
          gene_id     baseMean log2FoldChange     lfcSE        stat    pvalue
1 ENSG00000000003 12926.011840     0.10096750 0.4588043  0.22006659 0.8258193
2 ENSG00000000005    18.828508    -0.61861603 0.8402902 -0.73619336 0.4616130
3 ENSG00000000419  4025.832134     0.21512399 0.2723724  0.78981561 0.4296355
4 ENSG00000000457  1748.634605     0.20570153 0.3076286  0.66866831 0.5037071
5 ENSG00000000460  1272.541630     0.46317296 0.4467637  1.03672916 0.2998621
6 ENSG00000000938     6.719304    -0.05569774 2.1012478 -0.02650698 0.9788530
       padj deseq_sig
1 0.9982948     FALSE
2 0.9982948     FALSE
3 0.9982948     FALSE
4 0.9982948     FALSE
5 0.9982948     FALSE
6 0.9982948     FALSE
# Gene universe = genes present in BOTH results
univ <- intersect(dex_gene$gene_id, OcularDesq$gene_id)
both <- dex_gene |> 
  filter(gene_id %in% univ) |> 
  mutate(gene_id, dex_sig) |> 
  inner_join(OcularDesq |> 
                filter(gene_id %in% univ) |> 
                mutate(gene_id, deseq_sig),
             by = "gene_id")

contig_tab <- table(DEXSeq = both$dex_sig, DESeq2 = both$deseq_sig)
contig_tab
       DESeq2
DEXSeq  FALSE  TRUE
  FALSE 14243    22
  TRUE   2853   527

Fisher Test

ft <- fisher.test(contig_tab, alternative = "greater")
ft

    Fisher's Exact Test for Count Data

data:  contig_tab
p-value < 2.2e-16
alternative hypothesis: true odds ratio is greater than 1
95 percent confidence interval:
 82.62826      Inf
sample estimates:
odds ratio 
  119.5491 
  • There is very strong overlap between genes detected by DEXSeq and DESeq2.

  • When DEXSeq identifies a gene with differential isoform usage, that gene is much more likely than random also to show differential expression at the gene level.

  • Power is different across methods—DEXSeq finds many more significant genes (isoform-level testing is more sensitive), but the ones shared with DESeq2 are highly enriched.

Compare to Differential Expression at Gene-set Level

BP_df = msigdbr(species = "human", category = "C5", subcategory = "BP")
BP_list = split(BP_df$ensembl_gene, BP_df$gs_name)
deseq_names <- sub("\\.\\d+$", "", OcularDesq$gene_id)
deseq_vals <- OcularDesq$stat
stats_deseq <- setNames(deseq_vals, deseq_names)

stats_deseq <- stats_deseq[!is.na(stats_deseq)]
stats_deseq <- sort(stats_deseq, decreasing = TRUE)
                        
dex_names <- sub("\\.\\d+$", "", dex_gene$gene_id)
dex_vals <- dex_gene$stat
stats_dex <- setNames(dex_vals, dex_names)

stats_dex <- stats_dex[!is.na(stats_dex)]
stats_dex <- sort(stats_dex, decreasing = TRUE)
fg_deseq <- fgseaMultilevel(pathways = BP_list, stats = stats_deseq,
                            minSize = 15, maxSize = 500, scoreType = "std")
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (2.4% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
dim(fg_deseq)
[1] 4244    8
fg_dex <- fgseaMultilevel(pathways = BP_list, stats = stats_dex,
                          minSize = 15, maxSize = 500, scoreType = "std")
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (1.92% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
gseaParam, : All values in the stats vector are greater than zero and scoreType
is "std", maybe you should switch to scoreType = "pos".
Warning in fgseaMultilevel(pathways = BP_list, stats = stats_dex, minSize = 15,
: There were 8 pathways for which P-values were not calculated properly due to
unbalanced (positive and negative) gene-level statistic values. For such
pathways pval, padj, NES, log2err are set to NA. You can try to increase the
value of the argument nPermSimple (for example set it nPermSimple = 10000)
Warning in fgseaMultilevel(pathways = BP_list, stats = stats_dex, minSize = 15,
: For some of the pathways the P-values were likely overestimated. For such
pathways log2err is set to NA.
fg_dex_ocular <- fg_dex
dim(fg_dex)
[1] 3734    8
sig_deseq <- fg_deseq  |>  filter(padj < alpha)  |>  arrange(padj)
sig_dex <- fg_dex  |>  filter(padj < alpha) |>  arrange(padj)

n_sig_de <- nrow(sig_deseq)
n_sig_dx <- nrow(sig_dex)

# shared pathways (by name)
shared <- intersect(sig_deseq$pathway, sig_dex$pathway)
n_shared <- length(shared)

n_sig_de
[1] 1075
n_sig_dx
[1] 5
n_shared
[1] 2
shared
[1] "GOBP_REGULATION_OF_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"
[2] "GOBP_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"              
de_df <- fg_deseq |> transmute(pathway, NES_de = NES, padj_de = padj)
dx_df <- fg_dex |> transmute(pathway, NES_dx = NES, padj_dx = padj)

# Merge on pathway (shared sets only)
cmp <- inner_join(de_df, dx_df, by = "pathway") %>%
  mutate(sig_cat = case_when(
    padj_de < alpha & padj_dx < alpha ~ "Both",
    padj_de < alpha & padj_dx >= alpha ~ "DESeq2 only",
    padj_de >= alpha & padj_dx < alpha ~ "DEXSeq only",
    TRUE ~ "None"
  )) %>%
  mutate(sig_cat = factor(sig_cat, levels = c("Both","DESeq2 only","DEXSeq only","None")))


cmp
                                                         pathway     NES_de
                                                          <char>      <num>
   1:                      GOBP_2_OXOGLUTARATE_METABOLIC_PROCESS  1.6815239
   2: GOBP_3_PHOSPHOADENOSINE_5_PHOSPHOSULFATE_METABOLIC_PROCESS -1.2756852
   3:                     GOBP_3_UTR_MEDIATED_MRNA_STABILIZATION  1.3086038
   4:                       GOBP_ACETYL_COA_BIOSYNTHETIC_PROCESS  1.7231721
   5:                          GOBP_ACETYL_COA_METABOLIC_PROCESS  1.7733249
  ---                                                                      
3655:                          GOBP_XENOBIOTIC_METABOLIC_PROCESS  1.2992393
3656:                                  GOBP_XENOBIOTIC_TRANSPORT  0.6645772
3657:                                  GOBP_ZINC_ION_HOMEOSTASIS -1.3121420
3658:                                    GOBP_ZINC_ION_TRANSPORT -1.3287925
3659:                                    GOBP_ZYMOGEN_ACTIVATION  0.8576990
         padj_de    NES_dx   padj_dx     sig_cat
           <num>     <num>     <num>      <fctr>
   1: 0.05686885 0.9598756 1.0000000        None
   2: 0.33066785 1.0705436 1.0000000        None
   3: 0.28263335 0.9706749 1.0000000        None
   4: 0.05435541 1.1690774 1.0000000        None
   5: 0.03164361 1.0416604 1.0000000 DESeq2 only
  ---                                           
3655: 0.20220078 0.9585218 1.0000000        None
3656: 0.95565365 1.1842482 1.0000000        None
3657: 0.24151258 1.5104315 0.5993666        None
3658: 0.25098715 1.5625600 0.3884447        None
3659: 0.79088753 1.0223987 1.0000000        None
# Scatter + smooth
ggplot(cmp, aes(x = NES_de, y = NES_dx, color = sig_cat)) +
  geom_point(alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "NES (DESeq2 fgsea)",
    y = "NES (DEXSeq fgsea)",
    color = "Significance",
    title = "NES comparison: DESeq2 vs DEXSeq (fgseaMultilevel)"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 8 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_point()`).

save(OcularResult, file="data/Ocular_dxdResult.RData")

Spinal Motor Neurons

Load Data

isoCm_Spinal <- read.csv("data/SpinalIsoformCountMatrix.csv", row.names = 1)
Spinal_Metadata <- read.csv("data/SpinalSampleMetadata.csv", row.names = 1)
IsoformInfo <- read.csv("data/isoformInfo.csv")
SpinalResult <- dexseq_analysis(isoCm_Spinal, Spinal_Metadata, IsoformInfo)
converting counts to integer mode
Warning in DESeqDataSet(rse, design, ignoreRank = TRUE): some variables in
design formula are characters, converting to factors

Compare to Differential Expression:

alpha <- 0.05  # example threshold

SpinalResultGene <- SpinalResult |> 
  as.data.frame() |> 
  as_tibble() |>
  filter(!is.na(pvalue)) |> 
  select(
    gene_id = groupID,
    isoform_id = featureID,
    stat,
    log2FC = log2fold_SOD1_C9ORF72,
    pvalue,
    padj
  ) |> 
  group_by(gene_id) |> 
  slice_min(
    order_by = pvalue,
    n = 1,
    with_ties = FALSE
  )  |> 
  ungroup()  |> 
  mutate(
    padj = p.adjust(pvalue, method = 'fdr')
  ) |>
  filter(!is.na(padj))
dex_gene <- SpinalResultGene |> 
  mutate(gene_id,
         dex_padj = padj,
         dex_sig = !is.na(padj) & padj < alpha)
load("data/Spinal_dseqResult.RData")
SpinalDesq <- res_spinal
SpinalDesq$padj <- as.numeric(SpinalDesq$padj)
SpinalDesq <- SpinalDesq |>
  as.data.frame() |> 
  rownames_to_column("gene_id") |>
  mutate(deseq_sig = !is.na(padj) & padj < alpha) |>
  filter(!is.na(padj))
# Gene universe = genes present in BOTH results
univ <- intersect(dex_gene$gene_id, SpinalDesq$gene_id)
both <- dex_gene %>%
  filter(gene_id %in% univ) %>%
  mutate(gene_id, dex_sig) %>%
  inner_join(SpinalDesq %>%
    filter(gene_id %in% univ) %>%
    mutate(gene_id, deseq_sig), by = "gene_id")

contig_tab <- table(DEXSeq = both$dex_sig, DESeq2 = both$deseq_sig)
contig_tab
       DESeq2
DEXSeq  FALSE TRUE
  FALSE  9609  151
  TRUE   5717 2569

Fisher Test

ft <- fisher.test(contig_tab, alternative = "greater")
ft

    Fisher's Exact Test for Count Data

data:  contig_tab
p-value < 2.2e-16
alternative hypothesis: true odds ratio is greater than 1
95 percent confidence interval:
 24.80359      Inf
sample estimates:
odds ratio 
  28.59148 

Compare to Differential Expression at Gene-set Level

BP_df = msigdbr(species = "human", category = "C5", subcategory = "BP")
BP_list = split(BP_df$ensembl_gene, BP_df$gs_name)
deseq_names <- sub("\\.\\d+$", "", SpinalDesq$gene_id)
deseq_vals <- SpinalDesq$stat
stats_deseq <- setNames(deseq_vals, deseq_names)

stats_deseq <- stats_deseq[!is.na(stats_deseq)]
stats_deseq <- sort(stats_deseq, decreasing = TRUE)
                        
dex_names <- sub("\\.\\d+$", "", dex_gene$gene_id)
dex_vals <- dex_gene$stat
stats_dex <- setNames(dex_vals, dex_names)

stats_dex <- stats_dex[!is.na(stats_dex)]
stats_dex <- sort(stats_dex, decreasing = TRUE)
fg_deseq <- fgseaMultilevel(pathways = BP_list, stats = stats_deseq,
                            minSize = 15, maxSize = 500, scoreType = "std")
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (2.9% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
fg_dex <- fgseaMultilevel(pathways = BP_list, stats = stats_dex,
                          minSize = 15, maxSize = 500, scoreType = "std")
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (2.02% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
gseaParam, : All values in the stats vector are greater than zero and scoreType
is "std", maybe you should switch to scoreType = "pos".
Warning in fgseaMultilevel(pathways = BP_list, stats = stats_dex, minSize = 15,
: There were 5 pathways for which P-values were not calculated properly due to
unbalanced (positive and negative) gene-level statistic values. For such
pathways pval, padj, NES, log2err are set to NA. You can try to increase the
value of the argument nPermSimple (for example set it nPermSimple = 10000)
fg_dex_spinal <- fg_dex
sig_deseq <- fg_deseq %>% filter(padj < alpha) %>% arrange(padj)
sig_dex <- fg_dex %>% filter(padj < alpha) %>% arrange(padj)
n_sig_de <- nrow(sig_deseq)
n_sig_dx <- nrow(sig_dex)
# shared pathways (by name)
shared <- intersect(sig_deseq$pathway, sig_dex$pathway)
n_shared <- length(shared)

n_sig_de
[1] 635
n_sig_dx
[1] 2
n_shared
[1] 2
shared
[1] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION"                   
[2] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_ANTIGEN"
de_df <- fg_deseq %>% transmute(pathway, NES_de = NES, padj_de = padj)
dx_df <- fg_dex %>% transmute(pathway, NES_dx = NES, padj_dx = padj)
# Merge on pathway (shared sets only)
cmp <- inner_join(de_df, dx_df, by = "pathway") %>%
  mutate(sig_cat = case_when(
    padj_de < alpha & padj_dx < alpha ~ "Both",
    padj_de < alpha & padj_dx >= alpha ~ "DESeq2 only",
    padj_de >= alpha & padj_dx < alpha ~ "DEXSeq only",
    TRUE ~ "None"
  )) %>%
  mutate(sig_cat = factor(sig_cat, levels = c("Both","DESeq2 only","DEXSeq only","None")))
# Scatter + smooth
ggplot(cmp, aes(x = NES_de, y = NES_dx, color = sig_cat)) +
geom_point(alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE) +
labs(
x = "NES (DESeq2 fgsea)",
y = "NES (DEXSeq fgsea)",
color = "Significance",
title = "NES comparison: DESeq2 vs DEXSeq (fgseaMultilevel)"
) +
theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_point()`).

save(SpinalResult, file="data/Spinal_dxdResult.RData")
iso_id_padj_value_ocular = data.frame("transcript_id" = OcularResult$featureID,
                                       "padj" = OcularResult$padj)

iso_id_padj_value_spinal = data.frame("transcript_id" = SpinalResult$featureID,
                                       "padj" = SpinalResult$padj)

write.csv(iso_id_padj_value_ocular, "data/iso_id_padj_value_ocular.csv")
write.csv(iso_id_padj_value_spinal, "data/iso_id_padj_value_spinal.csv")

Investigation of Significant Pathways

Compare Pathways

head(OcularResultGene)
# A tibble: 6 × 6
  gene_id         isoform_id        stat log2FC   pvalue    padj
  <chr>           <chr>            <dbl>  <dbl>    <dbl>   <dbl>
1 ENSG00000000003 ENST00000612152  0.663 -1.38  0.718    0.802  
2 ENSG00000000419 ENST00000371582 15.4   -0.441 0.000459 0.00457
3 ENSG00000000457 ENST00000423670  4.38   0.632 0.112    0.262  
4 ENSG00000000460 ENST00000459772  2.89   0.462 0.236    0.410  
5 ENSG00000000971 ENST00000695981  2.25   2.06  0.324    0.494  
6 ENSG00000001036 ENST00000451668  5.16  -0.963 0.0757   0.205  
head(SpinalResultGene)
# A tibble: 6 × 6
  gene_id         isoform_id       stat log2FC   pvalue     padj
  <chr>           <chr>           <dbl>  <dbl>    <dbl>    <dbl>
1 ENSG00000000003 ENST00000496771  1.54 -0.221 4.63e- 1 5.43e- 1
2 ENSG00000000419 ENST00000371584 55.1  -1.42  1.10e-12 1.77e-11
3 ENSG00000000457 ENST00000470238  2.15  0.346 3.41e- 1 4.28e- 1
4 ENSG00000000460 ENST00000413811  2.43  0.235 2.97e- 1 3.82e- 1
5 ENSG00000000971 ENST00000695986 33.2  15.0   6.05e- 8 5.45e- 7
6 ENSG00000001036 ENST00000367585 17.2  -0.705 1.81e- 4 8.18e- 4
fg_dex_spinal <- fg_dex_spinal |> mutate(sig_spinal = !is.na(padj) & padj < alpha)

fg_dex_ocular <- fg_dex_ocular |> mutate(sig_ocular = !is.na(padj) & padj < alpha)
both <- fg_dex_spinal |>
          left_join(fg_dex_ocular, by="pathway") |>
          select(pathway, sig_spinal, sig_ocular) |> 
          mutate(sig = case_when(
            sig_spinal & sig_ocular ~ "Both",
            sig_spinal ~ "Spinal only",
            sig_ocular ~ "Ocular only",
            TRUE ~ "None"
          )) 
both |> group_by(sig) |> count()
# A tibble: 3 × 2
# Groups:   sig [3]
  sig             n
  <chr>       <int>
1 None         3738
2 Ocular only     5
3 Spinal only     2

Pathways enriched in _ compared to _ [Spinal]

both |> filter(sig_spinal) |> pull(pathway)
[1] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION"                   
[2] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_ANTIGEN"

Pathways enriched in _ compared to _ [Ocular]

both |> filter(sig_ocular) |> pull(pathway)
[1] "GOBP_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"              
[2] "GOBP_ARGININE_METABOLIC_PROCESS"                      
[3] "GOBP_GLUTAMINE_FAMILY_AMINO_ACID_CATABOLIC_PROCESS"   
[4] "GOBP_NITRIC_OXIDE_MEDIATED_SIGNAL_TRANSDUCTION"       
[5] "GOBP_REGULATION_OF_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"

Pathways enriched only in Ocular Motor Neurons

both |> filter(sig=='Ocular only') |> pull(pathway)
[1] "GOBP_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"              
[2] "GOBP_ARGININE_METABOLIC_PROCESS"                      
[3] "GOBP_GLUTAMINE_FAMILY_AMINO_ACID_CATABOLIC_PROCESS"   
[4] "GOBP_NITRIC_OXIDE_MEDIATED_SIGNAL_TRANSDUCTION"       
[5] "GOBP_REGULATION_OF_ACTIVIN_RECEPTOR_SIGNALING_PATHWAY"

Pathways enriched only in Spinal Motor Neurons

both |> filter(sig=='Spinal only') |> pull(pathway)
[1] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION"                   
[2] "GOBP_ANTIGEN_PROCESSING_AND_PRESENTATION_OF_PEPTIDE_ANTIGEN"

Pathways enriched in Both Motor Neurons

both |> filter(sig=='Both') |> pull(pathway)
character(0)